Application 6: Visualizing Correlations and Models

Author

Erik Westlund

Published

June 12, 2025

# List of required packages
required_packages <- c(
  "dplyr",
  "ggplot2",
  "broom.mixed",
  "lme4",
  "readr",
  "emmeans"
)

# Install missing packages
new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])]
if(length(new_packages)) install.packages(new_packages)

# Load all packages
library(dplyr)
library(ggplot2)
library(broom.mixed)
library(lme4)
library(readr)
library(emmeans)

source(here::here("examples", "colors.R"))
set.seed(123)

Looking At Data: Correlations

Correlations are indisposable for understanding the relationship between two variables, but they can be misleading as we show below.

This is adapted directly from Jan Vanhove.

plot_r <- function(df, showSmoother = TRUE, smoother = "lm") {
  p <- ggplot(df, aes(x = x, y = y)) +
    geom_point(alpha = 0.7)
  
  if(showSmoother) {
    p <- p +
    geom_smooth(
      formula = y ~ x,
      method = smoother,
      color = colors$green$`500`,
      fill = colors$slate$`300`,
      alpha = 0.3,
    )
  }

  p <- p +
    facet_wrap(~title, scales = "free", ncol = 2) +
    theme_minimal(base_size = 12) +
    theme(
      strip.text = element_text(face = "bold", size = 14),
      axis.title = element_blank(),
      plot.title = element_text(size = 24, face = "bold", hjust = 0.5),
      panel.grid.minor = element_blank(),
      axis.text = element_text(size = 10),
      panel.spacing = unit(2, "lines")
    )
  
  p
}

corr_r <- function(r = 0.6, n = 50) {
  
  compute.y <- function(x, y, r) {
    theta <- acos(r)
    X <- cbind(x, y)
    Xctr <- scale(X, center = TRUE, scale = FALSE)    # Centered variables (mean 0)
    Id <- diag(n)                                     # Identity matrix
    Q <- qr.Q(qr(Xctr[, 1, drop = FALSE]))            # QR decomposition
    P <- tcrossprod(Q)                                # Projection onto space defined by x1
    x2o <- (Id - P) %*% Xctr[, 2]                     # x2ctr made orthogonal to x1ctr
    Xc2 <- cbind(Xctr[, 1], x2o)
    Y <- Xc2 %*% diag(1 / sqrt(colSums(Xc2 ^ 2)))
    y <- Y[, 2] + (1 / tan(theta)) * Y[, 1]
    return(y)
  }
  
  cases <- list(
    list(id = 1, title = "(1) Normal x, normal residuals", x = rnorm(n), y = rnorm(n)),
    list(id = 2, title = "(2) Uniform x, normal residuals", x = runif(n, 0, 1), y = rnorm(n)),
    list(id = 3, title = "(3) +-skewed x, normal residuals", x = rlnorm(n, 5), y = rnorm(n)),
    list(id = 4, title = "(4) --skewed x, normal residuals", x = rlnorm(n, 5) * -1 + 5000, y = rnorm(n)),
    list(id = 5, title = "(5) Normal x, +-skewed residuals", x = rnorm(n), y = rlnorm(n, 5)),
    list(id = 6, title = "(6) Normal x, --skewed residuals", x = rnorm(n), y = -rlnorm(n, 5)),
    list(id = 7, title = "(7) Increasing spread", 
         x = sort(rnorm(n)) + abs(min(rnorm(n))), 
         y = rnorm(n, 0, sqrt(abs(10 * sort(rnorm(n)))))),
    list(id = 8, title = "(8) Decreasing spread", 
         x = sort(rnorm(n)) + abs(min(rnorm(n))), 
         y = rnorm(n, 0, sqrt(pmax(0.1, abs(10 * max(sort(rnorm(n))) - 10 * sort(rnorm(n))))))),
    list(id = 9, title = "(9) Quadratic trend", x = rnorm(n), y = rnorm(n) ^ 2),
    list(id = 10, title = "(10) Sinusoid relationship", x = runif(n, -2 * pi, 2 * pi), y = sin(runif(n, -2 * pi, 2 * pi))),
    list(id = 11, title = "(11) A single positive outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), 15)),
    list(id = 12, title = "(12) A single negative outlier", x = c(rnorm(n - 1), 10), y = c(rnorm(n - 1), -15)),
    list(id = 13, title = "(13) Bimodal residuals", x = rnorm(n), y = c(rnorm(floor(n / 2), mean = -3), rnorm(ceiling(n / 2), 3))),
    list(id = 14, title = "(14) Two groups", 
         x = c(rnorm(floor(n / 2), -3), rnorm(ceiling(n / 2), 3)), 
         y = c(rnorm(floor(n / 2), mean = 3), rnorm(ceiling(n / 2), mean = -3))),
    list(id = 15, title = "(15) Sampling at the extremes", 
         x = c(rnorm(floor(n / 2)), rnorm(ceiling(n / 2), mean = 10)), 
         y = rnorm(n)),
    list(id = 16, title = "(16) Categorical data", 
         x = sample(1:5, n, replace = TRUE), 
         y = sample(1:7, n, replace = TRUE))
  )
  
  df <- bind_rows(lapply(cases, function(case) {
    id = case$id
    x <- case$x
    y <- compute.y(x, case$y, r)
    data.frame(id = id, x = x, y = y, title = case$title)
  }))
  
  df$title <- factor(df$title, levels = paste0("(", 1:16, ") ", c(
    "Normal x, normal residuals",
    "Uniform x, normal residuals",
    "+-skewed x, normal residuals",
    "--skewed x, normal residuals",
    "Normal x, +-skewed residuals",
    "Normal x, --skewed residuals",
    "Increasing spread",
    "Decreasing spread",
    "Quadratic trend",
    "Sinusoid relationship",
    "A single positive outlier",
    "A single negative outlier",
    "Bimodal residuals",
    "Two groups",
    "Sampling at the extremes",
    "Categorical data"
  )))
  

  return(df)
}


data <- corr_r(r=0.3, n=100)

analysis_data <- data |> filter(id == 1)
model <- lm(y ~ x, data = analysis_data)
summary(model)

Call:
lm(formula = y ~ x, data = analysis_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.19848 -0.07113 -0.00910  0.06042  0.34241 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -0.00313    0.01015  -0.308  0.75846   
x            0.03463    0.01112   3.113  0.00243 **
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.101 on 98 degrees of freedom
Multiple R-squared:   0.09, Adjusted R-squared:  0.08071 
F-statistic: 9.692 on 1 and 98 DF,  p-value: 0.002426
cor(analysis_data$x, analysis_data$y)
[1] 0.3
plot_r(data, showSmoother=FALSE)

plot_r(data, showSmoother=TRUE)

Visualizing Model Outputs

Visualization of model outputs is often neglected, but it can be a powerful way both to undrestand and to communicate the results of a model. A number of packages exist to help with this, including broom.mixed, emmeans, and ggeffects.

Here, we fit a logistic mixed-effects model using the glmer() function to estimate the odds of receiving comprehensive postnatal care. This model accounts for both fixed effects (individual-level predictors like race or insurance status) and a random intercept for provider, which captures unobserved heterogeneity across providers.

# Load data
data <- read_csv(here::here("data", "processed", "simulated_data.csv"))
Rows: 50000 Columns: 20
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (6): state, self_report_income, edu, race_ethnicity, insurance, job_type
dbl (14): id, provider_id, received_comprehensive_postnatal_care, age, depen...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sample 1/4 of the sites
set.seed(123)
unique_sites <- unique(data$provider_id)
reduced_sites <- sample(unique_sites, length(unique_sites) * 0.10)
data <- data[data$provider_id %in% reduced_sites, ]

# Fit the model
model <- glmer(
  received_comprehensive_postnatal_care ~ 
    race_ethnicity +
    log(distance_to_provider) +
    insurance +
    multiple_gestation + 
    placenta_previa + 
    gest_hypertension + 
    preeclampsia +
    (1 | provider_id),  
  data = data, 
  family = binomial(link = "logit"),
  control = glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e5))
)

# Create variable labels for better visualization
variable_labels <- c(
  "race_ethnicityaian" = "AIAN",
  "race_ethnicityasian" = "Asian",
  "race_ethnicityblack" = "Black",
  "race_ethnicityhispanic" = "Hispanic",
  "race_ethnicitynhpi" = "NHPI",
  "race_ethnicityother" = "Other",
  "log(distance_to_provider)" = "Log(Distance to Provider)",
  "insuranceprivate" = "Insurance: Private",
  "insurancestate_provided" = "Insurance: State-Provided",
  "multiple_gestation" = "Multiple Gestation",
  "placenta_previa" = "Placenta Previa",
  "gest_hypertension" = "Gestational Hypertension",
  "preeclampsia" = "Preeclampsia",
  "(Intercept)" = "Intercept"
)

Below, we extract the fixed-effect estimates using broom.mixed::tidy(), including 95% confidence intervals, and plot them as a coefficient plot (a forest plot for regression coefficients).

Each point shows the estimated log-odds of receiving comprehensive care associated with a given covariate, holding other variables constant. The dashed vertical line at 0 represents no effect.

# Get fixed effects with confidence intervals
fixed_effects <- tidy(model, conf.int = TRUE) %>%
  filter(effect == "fixed") %>%
  # Remove any NA values
  filter(!is.na(estimate)) %>%
  # Create a more readable term name
  mutate(term = case_when(
    term == "(Intercept)" ~ "Intercept",
    term == "age" ~ "Age",
    term == "sexMale" ~ "Male Sex",
    term == "raceBlack" ~ "Black Race",
    term == "raceHispanic" ~ "Hispanic Race",
    term == "raceOther" ~ "Other Race",
    term == "insurancePrivate" ~ "Private Insurance",
    term == "insuranceMedicaid" ~ "Medicaid",
    term == "insuranceOther" ~ "Other Insurance",
    term == "comorbidity_score" ~ "Comorbidity Score",
    term == "emergency_admissionTRUE" ~ "Emergency Admission",
    term == "weekend_admissionTRUE" ~ "Weekend Admission",
    term == "night_admissionTRUE" ~ "Night Admission",
    TRUE ~ term
  ))

# Create the forest plot
ggplot(fixed_effects, aes(x = estimate, y = reorder(term, estimate))) +
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), 
                 height = 0.2, color = colors$slate$`300`) +
  geom_point(size = 3, color = colors$blue$`500`) +
  geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
  labs(
    title = "Fixed Effects from GLMM",
    subtitle = "Each point represents the estimated effect on log-odds of receiving comprehensive care",
    x = "Effect Size (95% CI)",
    y = "Predictor"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 10)
  )

Predicted Probabilities by Covariate Groups

Here, we use the emmeans package to estimate marginal predicted probabilities for selected covariates. These represent the expected probability of receiving comprehensive care for each level of a covariate, averaging over the distribution of other covariates in the model. This approach helps isolate the relationship between each covariate and the outcome while controlling for confounding. We visualize these estimates and their 95% confidence intervals, grouped by domain (e.g., race/ethnicity, insurance, medical conditions).

# Calculate predicted probabilities for each covariate group
library(emmeans)

# Race/Ethnicity
race_probs <- emmeans(model, ~ race_ethnicity, type = "response")
race_probs_df <- as.data.frame(race_probs) %>%
  mutate(
    group = "Race/Ethnicity",
    category = case_when(
      race_ethnicity == "aian" ~ "AIAN",
      race_ethnicity == "asian" ~ "Asian",
      race_ethnicity == "black" ~ "Black",
      race_ethnicity == "hispanic" ~ "Hispanic",
      race_ethnicity == "nhpi" ~ "NHPI",
      race_ethnicity == "other" ~ "Other",
      race_ethnicity == "white" ~ "White"
    )
  )

# Insurance
insurance_probs <- emmeans(model, ~ insurance, type = "response")
insurance_probs_df <- as.data.frame(insurance_probs) %>%
  mutate(
    group = "Insurance",
    category = case_when(
      insurance == "medicaid" ~ "Medicaid",
      insurance == "private" ~ "Private",
      insurance == "state_provided" ~ "State-Provided",
      insurance == "uninsured" ~ "Uninsured",
      TRUE ~ "Other"
    )
  )

# Medical Conditions
conditions_probs <- emmeans(model, ~ multiple_gestation + placenta_previa + gest_hypertension + preeclampsia, type = "response")
conditions_probs_df <- as.data.frame(conditions_probs) %>%
  mutate(
    group = "Medical Conditions",
    category = case_when(
      multiple_gestation == 1 ~ "Multiple Gestation",
      placenta_previa == 1 ~ "Placenta Previa",
      gest_hypertension == 1 ~ "Gestational Hypertension",
      preeclampsia == 1 ~ "Preeclampsia",
      TRUE ~ "No Medical Conditions"
    )
  ) %>%
  # Remove duplicates where multiple conditions are true
  distinct(category, .keep_all = TRUE)

# Combine all predictions
all_probs <- bind_rows(
  race_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL),
  insurance_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL),
  conditions_probs_df %>% select(group, category, prob, asymp.LCL, asymp.UCL)
)

# Create faceted plot
ggplot(all_probs, aes(x = prob, y = reorder(category, prob))) +
  geom_errorbarh(aes(xmin = asymp.LCL, xmax = asymp.UCL), 
                 height = 0.2, color = colors$slate$`300`) +
  geom_point(size = 3, color = colors$blue$`500`) +
  facet_wrap(~ group, scales = "free_y", ncol = 1) +
  labs(
    title = "Predicted Probability of Receiving Comprehensive Care",
    subtitle = "By Covariate Groups",
    x = "Predicted Probability (95% CI)",
    y = "Category"
  ) +
  scale_x_continuous(labels = scales::percent) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 10),
    strip.text = element_text(size = 12, face = "bold")
  )

Provider-Level Random Effects

We visualize the provider-level random intercepts, which represent the estimated deviation of each provider from the overall mean (intercept) after accounting for the fixed effects in the model. This helps reveal between-provider variability and can identify providers whose outcomes are systematically higher or lower than expected. The dashed vertical line at zero indicates the overall average; providers to the right have higher-than-average effects, and those to the left are lower.

# Get random effects
random_effects <- ranef(model, condVar = TRUE)
random_effects_data <- as.data.frame(random_effects)

# Create the forest plot of random effects
ggplot(random_effects_data, aes(x = condval, y = reorder(grp, condval))) +
  geom_errorbarh(aes(xmin = condval - 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,]), 
                     xmax = condval + 1.96*sqrt(attr(random_effects$provider_id, "postVar")[1,1,])), 
                 height = 0.2, color = colors$slate$`300`) +
  geom_point(size = 3, color = colors$blue$`500`) +
  geom_vline(xintercept = 0, linetype = "dashed", color = colors$red$`600`) +
  labs(
    title = "Random Effects by Provider",
    subtitle = "Each point represents the provider-specific deviation from the overall intercept",
    x = "Provider-Specific Effect (95% CI)",
    y = "Provider ID"
  ) +
  theme_minimal() +
  theme(
    panel.grid.major.y = element_blank(),
    plot.title = element_text(hjust = 0.5, size = 16, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5, size = 12),
    axis.text.y = element_text(size = 10)
  )

# Print model summary
summary(model)
Generalized linear mixed model fit by maximum likelihood (Laplace
  Approximation) [glmerMod]
 Family: binomial  ( logit )
Formula: 
received_comprehensive_postnatal_care ~ race_ethnicity + log(distance_to_provider) +  
    insurance + multiple_gestation + placenta_previa + gest_hypertension +  
    preeclampsia + (1 | provider_id)
   Data: data
Control: glmerControl(optimizer = "bobyqa", optCtrl = list(maxfun = 1e+05))

      AIC       BIC    logLik -2*log(L)  df.resid 
   6183.2    6280.7   -3076.6    6153.2      4913 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.1303 -0.7272 -0.6137  1.2288  2.3987 

Random effects:
 Groups      Name        Variance Std.Dev.
 provider_id (Intercept) 0.1589   0.3987  
Number of obs: 4928, groups:  provider_id, 50

Fixed effects:
                           Estimate Std. Error z value Pr(>|z|)  
(Intercept)               -0.905542   0.391365  -2.314   0.0207 *
race_ethnicityasian        0.248827   0.402795   0.618   0.5367  
race_ethnicityblack       -0.059232   0.395410  -0.150   0.8809  
race_ethnicityhispanic     0.055657   0.388852   0.143   0.8862  
race_ethnicitynhpi         0.004071   0.700685   0.006   0.9954  
race_ethnicityother        0.279441   0.544121   0.514   0.6076  
race_ethnicitywhite        0.196744   0.384417   0.512   0.6088  
log(distance_to_provider) -0.023705   0.023838  -0.994   0.3200  
insuranceprivate           0.139653   0.074399   1.877   0.0605 .
insurancestate_provided    0.045824   0.082829   0.553   0.5801  
multiple_gestation         0.175905   0.178027   0.988   0.3231  
placenta_previa            0.309438   0.315725   0.980   0.3270  
gest_hypertension          0.035563   0.137519   0.259   0.7959  
preeclampsia               0.126002   0.224375   0.562   0.5744  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Correlation matrix not shown by default, as p = 14 > 12.
Use print(x, correlation=TRUE)  or
    vcov(x)        if you need it